* Session setup: close any log left open by an earlier run, wipe memory,
* turn off output paging and raise the matrix size limit.
capture log close
set more off
clear all
set matsize 11000

* Project folders: `results' receives output, `cleandata' holds the input
* data sets and is used as the working directory for the rest of the script.
global results "S:\Project\DemoSos2\common\felles\JR_RG\DrVA\ResultsRev2\"
global cleandata "S:\Project\DemoSos2\common\felles\JR_RG\DrVA\CleanData\"

cd "$cleandata"

*---------------------------------------------------------------
* 1 - get main diagnoses
*---------------------------------------------------------------
* For each year 2006-2014: load kurh_<year>, keep ICPC-2 records of people
* aged 55+, keep the subsample of persons whose lopenr does not end in an odd
* digit, flag two-character diagnosis prefixes (letter + digit) found in
* `diagnose', and save one row per lopenr-year to diagnoses_y<year>.
foreach num of numlist 2006/2014 {
	use lopenr year diagnose age TypeDiag docid using "kurh_`num'.dta", clear
	keep if TypeDiag ==  "ICPC-2"
	drop TypeDiag
	keep if age >= 55 & age != .

	* Deterministic ~50% subsample: drop persons whose lopenr ends in an odd digit.
	* The last character is compared as a string, so no scratch variable is
	* needed and a non-numeric last character cannot abort the run.
	drop if inlist(substr(lopenr, -1, 1), "1", "3", "5", "7", "9")

	* diagnoses: one 0/1 indicator per prefix, created in the order
	* A..Y (digits 0 1 2 7 8 9), then B R (0 2 7 8 9), then Z (0 1 2)
	foreach x in A D F H K L N P S T U W X Y B R Z {
		local digits "0 1 2 7 8 9"
		if inlist("`x'", "B", "R") {
			local digits "0 2 7 8 9"
		}
		else if "`x'" == "Z" {
			local digits "0 1 2"
		}
		foreach d of local digits {
			* regexm() is unanchored: the prefix may match anywhere in `diagnose'
			ge d_`x'`d' = regexm(diagnose , "`x'`d'")
			* drop indicators that never fire in this year's file
			summarize d_`x'`d', meanonly
			if r(max) == 0 {
				drop d_`x'`d'
			}
		}
	}

	* most frequent gp per person
	* (egen mode() returns missing when several docid values tie)
	bys lopenr: egen gp = mode(docid)
	* one row per person-year: gp averaged (constant within person), indicators summed
	collapse gp (sum) d_*, by(lopenr year)
	save diagnoses_y`num', replace
}


* Hospital (npr) cancer variable by person-year, restricted like the
* primary-care files (age 55+, same lopenr-based subsample).
use lopenr year npr_cancer_c if year <= 2015 using npr, clear

* get year of birth (and sex) to construct age
* keep(master match): rows found only in the using file carry no `year', so
* they would get a missing age and be dropped below anyway.
merge m:1 lopenr using "S:\Project\DemoSos2\data2020\Befolkn\Demogr\faste_oppl", keepusing(foedselsaar kjoenn) keep(master match) nogenerate

* drop individuals that could not enter the hospital because they are dead
merge m:1 lopenr using "mortality_rev.dta", keepusing(yr_death) keep(master match) nogenerate
* a missing yr_death compares as larger than 2008, so those rows are kept
drop if yr_death < 2008
ge age = year-foedselsaar
keep if age >= 55 & age != .

* Deterministic ~50% subsample: drop persons whose lopenr ends in an odd digit.
* Compared as a string so a non-numeric last character cannot abort the run.
drop if inlist(substr(lopenr, -1, 1), "1", "3", "5", "7", "9")

* merge with previous year primary care diagnoses:
* shifting year back by one makes the merge key point at the preceding year
replace year=year-1
save npr_forlasso, replace



* Stack the yearly person-level diagnosis counts.
* NOTE(review): diagnoses_y2006 is produced in section 1 but never appended
* here - confirm that starting in 2007 is intended.
use diagnoses_y2007, clear
forvalues yr = 2008/2014 {
	append using diagnoses_y`yr'
}

* Copy every d_<prefix> count into _<prefix>, with missing (prefix absent from
* a year's file) set to 0, then drop the d_ source column. Prefixes are
* processed in the order A..Y (digits 0 1 2 7 8 9), then B R (0 2 7 8 9).
* d_Z* columns are left untouched.
local counts ""
foreach x in A D F H K L N P S T U W X Y B R {
	local digits "0 1 2 7 8 9"
	if inlist("`x'", "B", "R") {
		local digits "0 2 7 8 9"
	}
	foreach d of local digits {
		* a prefix that never occurred in any year has no d_ column; skip it
		cap confirm variable d_`x'`d', exact
		if _rc == 0 {
			egen _`x'`d' = rowtotal(d_`x'`d')
			drop d_`x'`d'
			local counts `counts' _`x'`d'
		}
	}
}

* _K1 is removed from the set of count variables
cap drop _K1
local skip "_K1"
local counts : list counts - skip
compress
merge 1:1 lopenr year using npr_forlasso, nogenerate
* balance the panel: add every missing lopenr-year combination
fillin lopenr year

* diagnoses: rows that came only from npr_forlasso or from fillin have no counts
foreach v of local counts {
	replace `v' = 0 if `v' == .
}

replace npr_cancer_c=0 if npr_cancer_c==.
ge dnpr_cancer_c = npr_cancer_c > 0
* sex is known only on rows that carry kjoenn; leave it missing elsewhere so
* that persons without any kjoenn value are not coded as 0
ge _male = kjoenn == "1" if kjoenn != ""
bys lopenr: egen male = max(_male)
drop _male kjoenn
ge age2 = age*age

save temp_forlasso, replace

* List of persons (one row per lopenr) with a record in
* patientlevel_rev_extended that has yr_str_exog_swap in 2005-2014,
* str_exog_age >= 55 and a non-missing str_exogGPIDnew.
use lopenr yr_str_exog_swap str_exogGPIDnew str_exog_age using  patientlevel_rev_extended, clear
* note: a missing str_exog_age satisfies ">= 55" in Stata and is therefore kept
keep if inrange(yr_str_exog_swap, 2005, 2014) & str_exog_age >= 55 & str_exogGPIDnew != .
keep lopenr
duplicates drop
save temp_inswap, replace


* Lasso of the cancer indicator on age, year dummies and diagnosis counts.
use temp_forlasso, clear
* make sure that for the lasso we don't use individuals involved in the swaps:
* keep only rows whose lopenr is absent from temp_inswap
merge m:1 lopenr using temp_inswap
keep if _merge == 1

log using lasso_ap18, text replace
tabulate year, generate(dyear_)
* NOTE(review): _W*, _B* and _R* are not among the candidate predictors - confirm intended.
lasso linear dnpr_cancer_c age dyear_* ///
	_A* _D* _F* _H* _K* _L* _N* _P* _S* _T* _U* _X* _Y*, rseed(1234)
estimates store cancer_p
lassocoef cancer_p, display(coef,penalized) sort(coef,penalized)
log close

* remove the intermediate files
erase temp_forlasso.dta
erase temp_inswap.dta

*---------------------------------------------------------------
* 2 - check if high VA docs also are more likely to detect the most freq diagnoses associated to cancer
*---------------------------------------------------------------
* For each year 2006-2016: load kurh_<year>, keep ICPC-2 records of people
* aged 55+ (full sample, no subsampling), flag two-character diagnosis
* prefixes and save one row per lopenr-year to diagnoses_forall_y<year>.
forvalues num = 2006/2016 {
	use lopenr year diagnose age TypeDiag docid using "kurh_`num'.dta", clear
	keep if TypeDiag ==  "ICPC-2"
	drop TypeDiag
	keep if age >= 55 & age != .

	* diagnoses: indicators are created in the order A..Y, then B R, then Z
	foreach x in A D F H K L N P S T U W X Y B R Z {
		* digit set depends on the letter
		local digits "0 1 2 7 8 9"
		if inlist("`x'", "B", "R") {
			local digits "0 2 7 8 9"
		}
		else if "`x'" == "Z" {
			local digits "0 1 2"
		}
		foreach d of local digits {
			ge d_`x'`d' = (regexm(diagnose , "`x'`d'"))
			* the indicator is 0/1, so max == 0 means it never fires;
			* with no observations r(max) is missing and the variable stays
			summarize d_`x'`d', meanonly
			if r(max) == 0 {
				drop d_`x'`d'
			}
		}
	}

	* number of matching records per person-year
	collapse (sum) d_*, by(lopenr year)
	save diagnoses_forall_y`num', replace
}

* merge to sample of analysis
use diagnoses_forall_y2006, clear
forvalues yr = 2007/2016 {
	append using diagnoses_forall_y`yr'
}
* add the suffix 1 to every variable (year -> year1, d_A0 -> d_A01, ...),
* then give the person identifier its original name back
rename * *1
rename lopenr1 lopenr
* keep diagnosis rows with or without a match; discard rows found only in the using file
merge 1:1 lopenr year1 using  patientlevel_rev_extended, keep(master match) nogenerate
keep if inrange(yr_str_exog_swap, 2005, 2014) & str_exogGPIDnew != .

* largest mobility group
* a2group writes the connected-group identifier into `pair';
* size_pair is the number of observations in each group.
a2group, individual(str_exogGPIDnew) unit(str_exogGPIDprev) groupvar(pair)
bys pair: ge size_pair = _N
tab pair
* Inline comment uses "//": "///" is Stata's line-continuation marker and
* would join whatever follows on the next line into this drop command.
drop if size_pair < 10 // largest mobility group has 99%

merge 1:1 str_exogGPIDnew lopenr  using "$results/fe_addon_lo2.dta", nogenerate

* mortality outcomes: full-sample copies plus versions that are non-missing
* only when str_exog_age is at or above each threshold
ge death5 = mortality_5year
ge death2 = mortality_2year
foreach cut in 25 35 45 55 {
	ge death5_`cut' = mortality_5year if str_exog_age>=`cut'
}
foreach cut in 25 35 45 55 {
	ge death2_`cut' = mortality_2year if str_exog_age>=`cut'
}


* running index within str_exogGPIDnew (also sorts the data by it)
bys str_exogGPIDnew: ge n = _n
* recode main VA measure (more VA - better doc, less mortality)
ge newva = -afe_death2_55
* standardize using the mean and sd of the data currently in memory,
* i.e. before the sample restriction below
summarize newva, detail
gen stdva = (newva-r(mean))/r(sd)
lab var stdva "Standardized VA"
local va "stdva"

keep if str_exogGPIDnew!=. & stdva!=. & str_exog_age>=55
* controls used in the regressions
global controls "i.yr_str_exog_swap i.str_exog_age male"

* Table A16
* Regress each diagnosis outcome on standardized VA with str_exogGPIDprev
* absorbed and standard errors clustered on str_exogGPIDnew.
* NOTE(review): every diagnosis variable was given the suffix 1 before the
* merge, so unless a merged file supplies variables named exactly d_D7, d_T7,
* d_A7, d_X7, d_Y7, these names resolve only through Stata's variable
* abbreviation (e.g. d_D7 -> d_D71) - confirm.
egen d_XY7 = rowmax(d_X7  d_Y7)
local outcomes "d_D7 d_T7 d_A7 d_XY7"
foreach var of local outcomes {
	eststo d`var': areg `var' stdva $controls if  str_exog_age>=55 , absorb(str_exogGPIDprev) vce(cluster str_exogGPIDnew)
	* outcome mean over the estimation sample, attached to the results
	summarize `var' if e(sample)
	estadd scalar mean = r(mean)
	eststo `var'
}

esttab `outcomes' using "$results/lasso.tex",  booktabs legend replace ///
	label b(3) se(3) star(* 0.1 ** 0.05 *** 0.01) stats(emp mean N, fmt(%9.3g)) keep(stdva) ///
	mtitles(`outcomes')
	
	
	